Importing Libraries¶

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from pycountry_convert import country_alpha2_to_country_name,country_name_to_country_alpha3

Data's Basic Overview¶

In [2]:
# Load the salary dataset, using the first CSV column as the row index.
df = pd.read_csv(
    "ds_salaries.csv",
    index_col=0,
)
In [3]:
# Rich display of the loaded frame; pandas elides the middle rows of a long frame.
df
Out[3]:
work_year experience_level employment_type job_title salary salary_currency salary_in_usd employee_residence remote_ratio company_location company_size
0 2020 MI FT Data Scientist 70000 EUR 79833 DE 0 DE L
1 2020 SE FT Machine Learning Scientist 260000 USD 260000 JP 0 JP S
2 2020 SE FT Big Data Engineer 85000 GBP 109024 GB 50 GB M
3 2020 MI FT Product Data Analyst 20000 USD 20000 HN 0 HN S
4 2020 SE FT Machine Learning Engineer 150000 USD 150000 US 50 US L
... ... ... ... ... ... ... ... ... ... ... ...
602 2022 SE FT Data Engineer 154000 USD 154000 US 100 US M
603 2022 SE FT Data Engineer 126000 USD 126000 US 100 US M
604 2022 SE FT Data Analyst 129000 USD 129000 US 0 US M
605 2022 SE FT Data Analyst 150000 USD 150000 US 100 US M
606 2022 MI FT AI Scientist 200000 USD 200000 IN 100 US L

607 rows × 11 columns

In [4]:
# Per-column dtypes and non-null counts, plus the frame's memory footprint.
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 607 entries, 0 to 606
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           607 non-null    int64 
 1   experience_level    607 non-null    object
 2   employment_type     607 non-null    object
 3   job_title           607 non-null    object
 4   salary              607 non-null    int64 
 5   salary_currency     607 non-null    object
 6   salary_in_usd       607 non-null    int64 
 7   employee_residence  607 non-null    object
 8   remote_ratio        607 non-null    int64 
 9   company_location    607 non-null    object
 10  company_size        607 non-null    object
dtypes: int64(4), object(7)
memory usage: 56.9+ KB
In [5]:
# Plot the boolean "is missing" mask of every cell as a heatmap.
missing_mask = df.isna()
sns.heatmap(missing_mask)
Out[5]:
<AxesSubplot: >
In [6]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()
Out[6]:
work_year salary salary_in_usd remote_ratio
count 607.000000 6.070000e+02 607.000000 607.00000
mean 2021.405272 3.240001e+05 112297.869852 70.92257
std 0.692133 1.544357e+06 70957.259411 40.70913
min 2020.000000 4.000000e+03 2859.000000 0.00000
25% 2021.000000 7.000000e+04 62726.000000 50.00000
50% 2022.000000 1.150000e+05 101570.000000 100.00000
75% 2022.000000 1.650000e+05 150000.000000 100.00000
max 2022.000000 3.040000e+07 600000.000000 100.00000

EDA by Columns¶

In [7]:
# Boolean mask over job titles: True where the title occurs at most twice.
title_counts = df["job_title"].value_counts()
jobTitle = title_counts.le(2)
In [8]:
# Keep only the titles flagged True, i.e. those with at most two occurrences.
jobTitle[jobTitle]
Out[8]:
Principal Data Analyst            True
ETL Developer                     True
Product Data Analyst              True
Director of Data Engineering      True
Financial Data Analyst            True
Cloud Data Engineer               True
Lead Machine Learning Engineer    True
NLP Engineer                      True
Head of Machine Learning          True
3D Computer Vision Researcher     True
Data Specialist                   True
Staff Data Scientist              True
Big Data Architect                True
Finance Data Analyst              True
Marketing Data Analyst            True
Machine Learning Manager          True
Data Analytics Lead               True
Name: job_title, dtype: bool

Note:¶

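The job titles listed above each appear at most twice in the dataset, which is too few for reliable per-title statistics. Dropping them would remove many distinct titles,¶

so it is better to keep the rows and treat these titles with caution in the analysis.¶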

In [9]:
# Number of distinct years covered by the dataset.
df["work_year"].nunique()
Out[9]:
3
In [10]:
# Plotly figures are sized through `width`/`height` (in pixels). The previous
# `plt.figure(figsize=(8, 5))` call only produced an empty matplotlib figure
# and had no effect on this chart.
px.histogram(
    df["work_year"],
    title="Distribution of Year in the Dataset",
    width=800,
    height=500,
)
<Figure size 800x500 with 0 Axes>
In [11]:
# Mean USD salary per work year, computed once and reused for both the bar
# heights and the continuous colour scale.
mean_usd_by_year = df.groupby("work_year")["salary_in_usd"].mean()
px.bar(
    mean_usd_by_year.to_frame(),
    title="Average Salary (Year Wise)",
    color=mean_usd_by_year,
    color_continuous_scale=px.colors.sequential.Emrld,
)
In [12]:
# Mean USD salary per year expressed as an index relative to the first year in
# the data (first year = 100). A value of 130 therefore means a 30% increase
# over that baseline. The baseline is taken from the data rather than typed in
# by hand, so the cell stays correct if the dataset changes.
avg_usd_by_year = df.groupby("work_year")[["salary_in_usd"]].mean()
avg_usd_by_year / avg_usd_by_year.iloc[0] * 100
Out[12]:
salary_in_usd
work_year
2020 100.000000
2021 104.217374
2022 129.963581
In [13]:
# Mean USD salary for every job title; the same series drives bar height and colour.
mean_usd_by_title = df.groupby("job_title")["salary_in_usd"].mean()
px.bar(
    mean_usd_by_title,
    title="Average Salary by  Job Title",
    color=mean_usd_by_title,
    color_continuous_scale=px.colors.sequential.Aggrnyl_r,
)
In [14]:
# Highest USD salary recorded for each job title.
max_usd_by_title = df.groupby("job_title")["salary_in_usd"].max()
px.bar(
    max_usd_by_title,
    title="Maximum Salary by Job Title",
    orientation="h",
)
In [15]:
# Lowest USD salary recorded for each job title.
min_usd_by_title = df.groupby("job_title")["salary_in_usd"].min()
px.bar(
    min_usd_by_title,
    title="Minimum Salary by Job Title",
    orientation="h",
)
In [16]:
# How often each job title appears within each work year.
titles_by_year = df.groupby("work_year")["job_title"]
titles_by_year.value_counts()
Out[16]:
work_year  job_title                               
2020       Data Scientist                              21
           Data Engineer                               11
           Data Analyst                                 7
           Machine Learning Engineer                    5
           Big Data Engineer                            3
                                                       ..
2022       ML Engineer                                  1
           Machine Learning Infrastructure Engineer     1
           NLP Engineer                                 1
           Principal Data Analyst                       1
           Principal Data Scientist                     1
Name: job_title, Length: 98, dtype: int64
In [17]:
# Count of records per experience level, one colour per level.
experience = df["experience_level"]
px.histogram(
    experience,
    color=experience,
    title="Experience of the Individuals",
)
In [18]:
# Compare pay across experience levels using the USD-converted column.
# The raw `salary` column is expressed in each record's own currency
# (see `salary_currency`), so averaging it mixes currencies and makes the
# levels incomparable.
avg_usd_by_experience = df.groupby("experience_level")["salary_in_usd"].mean()
px.bar(
    avg_usd_by_experience,
    title="Average Salary as per Experience",
    color=avg_usd_by_experience,
    color_continuous_scale=px.colors.sequential.Peach,
)
In [19]:
# Count of records per employment type, one colour per type.
employment = df["employment_type"]
px.histogram(
    employment,
    title="Employment Type Distribution",
    color=employment,
)
In [20]:
# Average pay per employment type, based on the USD-converted column.
# The raw `salary` column is in each record's own currency, so its mean
# is not comparable across groups.
avg_usd_by_employment = df.groupby("employment_type")["salary_in_usd"].mean()
px.bar(
    avg_usd_by_employment,
    color=avg_usd_by_employment,
    color_continuous_scale=px.colors.sequential.Brwnyl,
    title="Average Salary based on Employment Type ",
)
In [21]:
# Distinct company location codes, in order of first appearance.
pd.unique(df["company_location"])
Out[21]:
array(['DE', 'JP', 'GB', 'HN', 'US', 'HU', 'NZ', 'FR', 'IN', 'PK', 'CN',
       'GR', 'AE', 'NL', 'MX', 'CA', 'AT', 'NG', 'ES', 'PT', 'DK', 'IT',
       'HR', 'LU', 'PL', 'SG', 'RO', 'IQ', 'BR', 'BE', 'UA', 'IL', 'RU',
       'MT', 'CL', 'IR', 'CO', 'MD', 'KE', 'SI', 'CH', 'VN', 'AS', 'TR',
       'CZ', 'DZ', 'EE', 'MY', 'AU', 'IE'], dtype=object)
In [22]:
# Convert the ISO alpha-2 company locations (e.g. "DE") to alpha-3 (e.g. "DEU").
# Values that are already three letters long pass through untouched, so the
# cell can be re-run safely instead of feeding converted codes back into the
# alpha-2 lookup. Each distinct code is converted once and then mapped onto
# the column, rather than converting row by row.
company_alpha3 = {
    code: (
        code
        if len(code) == 3
        else country_name_to_country_alpha3(country_alpha2_to_country_name(code))
    )
    for code in df["company_location"].unique()
}
df["company_location"] = df["company_location"].map(company_alpha3)
In [23]:
# Inspect the converted column; the values should now be three-letter codes.
df["company_location"]
Out[23]:
0      DEU
1      JPN
2      GBR
3      HND
4      USA
      ... 
602    USA
603    USA
604    USA
605    USA
606    USA
Name: company_location, Length: 607, dtype: object
In [24]:
# Records per company location; log-scaled y axis so small counts stay visible.
company_locations = df["company_location"]
px.histogram(
    company_locations,
    title="Count of Companies Location",
    hover_name=company_locations,
    log_y=True,
)
In [25]:
# Mean USD salary for each company location.
avgSal_country = (
    df.groupby("company_location")["salary_in_usd"]
    .mean()
)
In [26]:
# World map of the mean USD salary per company location.
# The per-country averages are put into their own small frame and referenced
# by column name. Previously the 607-row `df` was passed as the data frame
# even though `locations` and `color` came from the unrelated per-country
# series, and the colour bar got a generic label.
avg_salary_map = (
    avgSal_country
    .rename("avg_salary_usd")
    .rename_axis("company_location")
    .reset_index()
)
px.choropleth(
    avg_salary_map,
    locations="company_location",
    color="avg_salary_usd",
    color_continuous_scale=px.colors.sequential.Plasma_r,
    title="Average Salary based on Company Location",
)
In [27]:
# Convert the ISO alpha-2 employee residence codes to alpha-3.
# Values that are already three letters long pass through untouched, so the
# cell can be re-run safely instead of feeding converted codes back into the
# alpha-2 lookup. Each distinct code is converted once and then mapped onto
# the column.
residence_alpha3 = {
    code: (
        code
        if len(code) == 3
        else country_name_to_country_alpha3(country_alpha2_to_country_name(code))
    )
    for code in df["employee_residence"].unique()
}
df["employee_residence"] = df["employee_residence"].map(residence_alpha3)
In [28]:
# Records per employee residence country, on a log-scaled y axis.
residences = df["employee_residence"]
px.histogram(
    residences,
    title="Employee Resident Country Distribution",
    log_y=True,
)
In [29]:
# Share of records per company size as a pie chart on an explicit 6x6 inch axes.
fig, ax = plt.subplots(figsize=(6, 6))
size_counts = df["company_size"].value_counts()
size_counts.plot.pie(ax=ax)
ax.legend()
Out[29]:
<matplotlib.legend.Legend at 0x1a6a6ef3940>
In [30]:
# Mean USD salary per company size; the same series sets bar height and colour.
mean_usd_by_size = df.groupby("company_size")["salary_in_usd"].mean()
px.bar(
    mean_usd_by_size,
    title="Salary based on Company Size",
    color=mean_usd_by_size,
    color_continuous_scale=px.colors.sequential.Aggrnyl_r,
)
In [31]:
# Five most frequent salary currencies (value_counts sorts by frequency,
# highest first). A title is added so the figure stands on its own like the
# other charts in this notebook.
top_currencies = df["salary_currency"].value_counts().head(5)
px.bar(top_currencies, title="Top 5 Salary Currencies")
In [32]:
# Percentage of records per remote ratio. `normalize=True` divides by the
# actual number of records instead of a hard-coded 607, so the shares stay
# correct if the dataset changes.
remote_share_pct = (
    df["remote_ratio"]
    .value_counts(normalize=True)
    .mul(100)
    .rename("percent")
)
px.bar(remote_share_pct, title="Share of Records by Remote Ratio (%)")

Conclusions:¶

1. Majority of the Companies & Employees are US based.¶

2. The average salary (in USD) in the Data Science field rose by about 30% from 2020 to 2022.¶

3. Principal Data Engineer has the highest average salary (over 300K USD) and also the single highest salary (600K USD).¶

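4. Most individuals have experience level SE and are full-time employees. Experience level MI has the highest average of the raw `salary` column; because that column mixes currencies, confirm the ranking with `salary_in_usd` before concluding which level earns the most.¶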

5. Russia pays Data Science individuals the most with an average salary of 157.7K USD.¶

6. Most companies are medium- or large-sized, and both pay approximately equally on average.¶

7. About 62% of the job profiles have a remote ratio of 100.¶

8. USD and Euros are the most used currencies for paying.¶

In [ ]:
 
In [ ]: